R for Data Science assessment

options(repos = "http://cran.us.r-project.org") 
#import and load the required packages
#install.packages("tidyverse")
#install.packages("plotly")
library(tidyverse)
## ── Attaching core tidyverse packages ─────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ───────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
if (!require(ggpubr)) {
install.packages("ggpubr")
}
## Loading required package: ggpubr
if (!require(moments)) {
install.packages("moments")
}
## Loading required package: moments
library(ggpubr)

1. Reading the gapminder into a tibble

gapminder <- read_csv("gapminder_clean.csv")
## New names:
## Rows: 2607 Columns: 20
## ── Column specification
## ─────────────────────────────────────────────────────────────────────────── Delimiter: "," chr
## (2): Country Name, continent dbl (18): ...1, Year, Agriculture, value added (% of GDP), CO2
## emissions (me...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types
## or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
gapminder
## # A tibble: 2,607 × 20
##     ...1 Country…¹  Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
##    <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1     0 Afghanis…  1962    NA    0.0738  21.3        NA      NA    4.88    7.45
##  2     1 Afghanis…  1967    NA    0.124    9.92       NA      NA    6.77    7.45
##  3     2 Afghanis…  1972    NA    0.131   18.9        NA      NA   14.8     7.45
##  4     3 Afghanis…  1977    NA    0.183   13.8        NA      NA   11.7     7.45
##  5     4 Afghanis…  1982    NA    0.166   NA          NA      NA   NA       7.45
##  6     5 Afghanis…  1987    NA    0.276   NA          NA      NA   NA       7.46
##  7     6 Afghanis…  1992    NA    0.101   NA          NA      NA   NA       7.50
##  8     7 Afghanis…  1997    NA    0.0608  NA          NA      NA   NA       7.64
##  9     8 Afghanis…  2002    38.5  0.0411  NA          NA      NA   32.4     7.27
## 10     9 Afghanis…  2007    30.6  0.0879   0.535      NA      NA   17.8     6.44
## # … with 2,597 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## #   `Imports of goods and services (% of GDP)` <dbl>,
## #   `Industry, value added (% of GDP)` <dbl>,
## #   `Inflation, GDP deflator (annual %)` <dbl>,
## #   `Life expectancy at birth, total (years)` <dbl>,
## #   `Population density (people per sq. km of land area)` <dbl>,
## #   `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …

2. Filtering and plotting the data

gapminder_filtered <- filter(gapminder, Year == 1962)
gapminder_filtered
## # A tibble: 259 × 20
##     ...1 Country…¹  Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
##    <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1     0 Afghanis…  1962      NA  0.0738    21.3      NA      NA    4.88    7.45
##  2    10 Albania    1962      NA  1.44      NA        NA      NA   NA       6.28
##  3    20 Algeria    1962      NA  0.485     NA        NA      NA   19.8     7.61
##  4    30 American…  1962      NA NA         NA        NA      NA   NA      NA   
##  5    40 Andorra    1962      NA NA         NA        NA      NA   NA      NA   
##  6    50 Angola     1962      NA  0.216     NA        NA      NA   NA       7.40
##  7    60 Antigua …  1962      NA  1.82      NA        NA      NA   NA       4.34
##  8    70 Arab Wor…  1962      NA  0.761     18.2      NA      NA   NA       6.96
##  9    80 Argentina  1962      NA  2.52      17.3      NA      NA    4.69    3.09
## 10    90 Armenia    1962      NA NA         NA        NA      NA   NA       4.43
## # … with 249 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## #   `Imports of goods and services (% of GDP)` <dbl>,
## #   `Industry, value added (% of GDP)` <dbl>,
## #   `Inflation, GDP deflator (annual %)` <dbl>,
## #   `Life expectancy at birth, total (years)` <dbl>,
## #   `Population density (people per sq. km of land area)` <dbl>,
## #   `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
gapminder_plot <- ggplot(gapminder_filtered, aes(x = gdpPercap,
  y = `CO2 emissions (metric tons per capita)`)) +
  geom_point()
  ggsave("gapminderplot.png")
## Saving 7 x 5 in image
## Warning: Removed 151 rows containing missing values (`geom_point()`).
gapminder_plot
## Warning: Removed 151 rows containing missing values (`geom_point()`).

3. Finding correlation

correlation <- cor(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
  gapminder_filtered$gdpPercap, use = "complete.obs")
p_value <- cor.test(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
  gapminder_filtered$gdpPercap)$p.value
correlation
## [1] 0.9260817
p_value
## [1] 1.128679e-46

4

#cor_by_year <- gapminder |>
#  group_by(Year) |>
#  summarise(cor(gapminder_filtered$`CO2 emissions (metric tons per capita)`,
#  gapminder_filtered$gdpPercap, use = "complete.obs"))

# strongest_year_row <- filter(cor_by_year, correlation == max(correlation))
# strongest_year_row
# strongest_year <- strongest_year_row$Year
# strongest_year

gapminder_filtered <- gapminder %>%
  filter(!is.na(`CO2 emissions (metric tons per capita)`), !is.na(gdpPercap))

cor_by_year <- gapminder_filtered %>%
  group_by(Year) %>%
  summarise(correlation = cor(`CO2 emissions (metric tons per capita)`, gdpPercap, use = "complete.obs"))
  
cor_by_year
## # A tibble: 10 × 2
##     Year correlation
##    <dbl>       <dbl>
##  1  1962       0.926
##  2  1967       0.939
##  3  1972       0.843
##  4  1977       0.793
##  5  1982       0.817
##  6  1987       0.810
##  7  1992       0.809
##  8  1997       0.808
##  9  2002       0.801
## 10  2007       0.720
strongest_year_row <- cor_by_year %>%
  filter(correlation == max(correlation)) %>%
  slice(1)

strongest_year <- strongest_year_row
strongest_year
## # A tibble: 1 × 2
##    Year correlation
##   <dbl>       <dbl>
## 1  1967       0.939

5

gapminder_strongest_year <- gapminder %>%
  filter(Year == 1967)

gapminder_strongest_year
## # A tibble: 259 × 20
##     ...1 Country…¹  Year Agric…² CO2 e…³ Domes…⁴ Elect…⁵ Energ…⁶ Expor…⁷ Ferti…⁸
##    <dbl> <chr>     <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1     1 Afghanis…  1967   NA      0.124    9.92      NA      NA    6.77    7.45
##  2    11 Albania    1967   NA      1.36    NA         NA      NA   NA       5.39
##  3    21 Algeria    1967   10.3    0.632   28.0       NA      NA   23.4     7.67
##  4    31 American…  1967   NA     NA       NA         NA      NA   NA      NA   
##  5    41 Andorra    1967   NA     NA       NA         NA      NA   NA      NA   
##  6    51 Angola     1967   NA      0.167   NA         NA      NA   NA       7.40
##  7    61 Antigua …  1967   NA      9.11    NA         NA      NA   NA       4.04
##  8    71 Arab Wor…  1967   NA      1.33    31.3       NA      NA   NA       6.93
##  9    81 Argentina  1967    9.98   2.86    18.7       NA      NA    7.50    3.05
## 10    91 Armenia    1967   NA     NA       NA         NA      NA   NA       3.61
## # … with 249 more rows, 10 more variables: `GDP growth (annual %)` <dbl>,
## #   `Imports of goods and services (% of GDP)` <dbl>,
## #   `Industry, value added (% of GDP)` <dbl>,
## #   `Inflation, GDP deflator (annual %)` <dbl>,
## #   `Life expectancy at birth, total (years)` <dbl>,
## #   `Population density (people per sq. km of land area)` <dbl>,
## #   `Services, etc., value added (% of GDP)` <dbl>, pop <dbl>, …
strongest_year_plot <- ggplot(gapminder_strongest_year, 
  aes(x = gdpPercap, y = `CO2 emissions (metric tons per capita)`, 
  color = continent, 
  size = pop)) +
  geom_point()
ggplotly(strongest_year_plot)

1

We want to test if there is a statistically significant difference in the Energy use per continent. Thus, continent is the predictor variable, and energy use is the outcome variable.

To use a parametric test, we must ensure that three assumptions are met: Normality, equal variances, and independence.

Normality assumption: To test for normality, we conduct a Shapiro-Wilk Test for Normality.

#Conduct Shapiro-Wilk Test for normality 
shapiro.test(gapminder$"Energy use (kg of oil equivalent per capita)")
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder$"Energy use (kg of oil equivalent per capita)"
## W = 0.62442, p-value < 2.2e-16

As we can see, the p value of the test is < 2.2e-16, which is less than the alpha level of 0.05. This suggests that the samples do not come from a normal distribution.

Therefore, we conduct a Kruskal-Wallis H test

kruskal.test(`Energy use (kg of oil equivalent per capita)` ~ continent, data = gapminder)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  Energy use (kg of oil equivalent per capita) by continent
## Kruskal-Wallis chi-squared = 318.68, df = 4, p-value < 2.2e-16

Thus can reject the null hypothesis that there is no difference in the mean ranks of energy use across continents.

ggplot(gapminder, aes(x = `Energy use (kg of oil equivalent per capita)`)) +
geom_histogram(bins = 30) +
facet_wrap(~continent) +
labs(title = "Histogram of energy use per capita by continent",
x = "Energy use (kg of oil equivalent per capita)",
y = "Frequency")
## Warning: Removed 1197 rows containing non-finite values (`stat_bin()`).

#convert ggplot2 plot to plotly plot
ggplotly()
## Warning: Removed 1197 rows containing non-finite values (`stat_bin()`).

2

Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990? (stats test needed)

Filtering the data

gapminder_years <- gapminder %>%
  filter(Year > 1990)

Plotting a qqplot

# ggqqplot(data = gapminder_years, x = 'Imports of goods and services (% of GDP)', facet.by = 'continent')

# Filter data for the years after 1990
data <- gapminder %>% 
filter(Year > 1990) %>% 
 select(continent, Year, `Imports of goods and services (% of GDP)`)

# Rename the variable for convenience
data <- data %>% 
rename(gdp = `Imports of goods and services (% of GDP)`)

# Plot Q-Q plot with facet by continent
ggqqplot(data = data, x = "gdp", facet.by = "continent")
## Warning: Removed 152 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 152 rows containing non-finite values (`stat_qq_line()`).
## Removed 152 rows containing non-finite values (`stat_qq_line()`).

### Shapiro-Wilk Test for Normality

europe <- gapminder_years %>% 
filter(continent == "Europe")

asia <- gapminder_years %>% 
filter(continent == "Asia")

skewness(europe$'Imports of goods and services (% of GDP)', na.rm = TRUE)
## [1] 0.8351207
kurtosis(europe$'Imports of goods and services (% of GDP)', na.rm = TRUE)
## [1] 2.945533
shapiro.test(europe$'Imports of goods and services (% of GDP)'[complete.cases(data)]) 
## 
##  Shapiro-Wilk normality test
## 
## data:  europe$"Imports of goods and services (% of GDP)"[complete.cases(data)]
## W = 0.94204, p-value = 0.0124
skewness(asia$'Imports of goods and services (% of GDP)', na.rm = TRUE) 
## [1] 1.770528
kurtosis(asia$'Imports of goods and services (% of GDP)', na.rm = TRUE) 
## [1] 7.311968
shapiro.test(asia$'Imports of goods and services (% of GDP)'[complete.cases(data)]) 
## 
##  Shapiro-Wilk normality test
## 
## data:  asia$"Imports of goods and services (% of GDP)"[complete.cases(data)]
## W = 0.83268, p-value = 9.337e-06

As we can see, the p-values are less than 0.05. Therefore, we can reject the null hypothesis that the data is normally distributed, and we can apply a log transform.